Environmental data

# Load the environmental predictor stack (land cover, HadUK climate,
# elevation) projected on the British National Grid.
ed <- raster::stack('C:/Users/thoval/Documents/Analyses/Data/lcm_had_elev_national_grid.gri')
## Warning in showSRID(uprojargs, format = "PROJ", multiline = "NO", prefer_proj
## = prefer_proj): Discarded datum Unknown based on Airy 1830 ellipsoid in CRS
## definition
names(ed)
##  [1] "sea"                   "broad_wood"            "conif_wood"           
##  [4] "arable"                "impr_grass"            "neutr_grass"          
##  [7] "calc_grass"            "acid_grass"            "fen_marsh_swamp"      
## [10] "heather"               "heath_grass"           "bog"                  
## [13] "inland_rock"           "saltwater"             "freshwater"           
## [16] "sup_lit_rock"          "sup_lit_sed"           "lit_rock"             
## [19] "lit_sed"               "saltmarsh"             "urban"                
## [22] "suburban"              "AnnualTemp"            "MeanDiRange"          
## [25] "Isotherm"              "TempSeasonality"       "MaxTempWarmestMonth"  
## [28] "MinTempColdestMonth"   "TempAnnualRange"       "MeanTempWetQuarter"   
## [31] "MeanTempDriestQuarter" "MeanTempWarmQuarter"   "MeanTempColdQuarter"  
## [34] "AnnualPrecip"          "PrecipWetMonth"        "PrecipDriestMonth"    
## [37] "PrecipSeasonality"     "PrecipWettestQuarter"  "PrecipDriestQuarter"  
## [40] "PrecipWarmQuarter"     "PrecipColdQuarter"     "elevation_UK"         
## [43] "slope"                 "aspect"
## big problem with slope and aspect!!!
# The slope and aspect layers shipped with the stack are unusable, so drop
# them and recompute both from the elevation layer. Index by layer name
# rather than the previously hard-coded position 42, so the code survives
# any reordering of the stack.
ed <- raster::dropLayer(ed, i = match(c("slope", "aspect"), names(ed)))

slope <- raster::terrain(ed[["elevation_UK"]], opt = "slope", unit = "degrees")
aspect <- raster::terrain(ed[["elevation_UK"]], opt = "aspect", unit = "degrees")

# Append the recomputed terrain layers; they are added at the end, so
# "slope" and "aspect" are again the last two layers of the stack.
ed <- raster::stack(ed, slope, aspect)
# Confirm the layer ordering after recomputing the terrain layers.
names(ed)
##  [1] "sea"                   "broad_wood"            "conif_wood"           
##  [4] "arable"                "impr_grass"            "neutr_grass"          
##  [7] "calc_grass"            "acid_grass"            "fen_marsh_swamp"      
## [10] "heather"               "heath_grass"           "bog"                  
## [13] "inland_rock"           "saltwater"             "freshwater"           
## [16] "sup_lit_rock"          "sup_lit_sed"           "lit_rock"             
## [19] "lit_sed"               "saltmarsh"             "urban"                
## [22] "suburban"              "AnnualTemp"            "MeanDiRange"          
## [25] "Isotherm"              "TempSeasonality"       "MaxTempWarmestMonth"  
## [28] "MinTempColdestMonth"   "TempAnnualRange"       "MeanTempWetQuarter"   
## [31] "MeanTempDriestQuarter" "MeanTempWarmQuarter"   "MeanTempColdQuarter"  
## [34] "AnnualPrecip"          "PrecipWetMonth"        "PrecipDriestMonth"    
## [37] "PrecipSeasonality"     "PrecipWettestQuarter"  "PrecipDriestQuarter"  
## [40] "PrecipWarmQuarter"     "PrecipColdQuarter"     "elevation_UK"         
## [43] "slope"                 "aspect"
# Visual check of the recomputed layers over a sample window. Index by
# name (previously hard-coded positions 43/44) so this is robust to any
# reordering of the stack.
plot(ed[["slope"]], main = names(ed[["slope"]]), ylim=c(780000, 860000), xlim = c(60000, 200000))

plot(ed[["aspect"]], main = names(ed[["aspect"]]), ylim=c(780000, 860000), xlim = c(60000, 200000))

Get area of interest and drop correlated weather variables

# Area of interest in lon/lat: xmin = -4, xmax = 0.2, ymin = 53,
# ymax = 54.5 (the matrix rows are the x and y min/max pairs).
ext_h <- extent(matrix(c(-4,53, 0.2,54.5), ncol = 2))
# Promote the extent to a polygon so it can be reprojected.
e <- as(ext_h, "SpatialPolygons")
sp::proj4string(e) <- "+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0"

# Reproject the polygon to the British National Grid (OSGB36-style
# transverse Mercator on the Airy ellipsoid) so it matches the CRS of
# the raster stack.
e.geo <- sp::spTransform(e, CRS("+proj=tmerc +lat_0=49 +lon_0=-2 +k=0.9996012717 +x_0=400000 +y_0=-100000 +ellps=airy +units=m +no_defs"))
## Warning in showSRID(uprojargs, format = "PROJ", multiline = "NO", prefer_proj
## = prefer_proj): Discarded datum Unknown based on Airy 1830 ellipsoid in CRS
## definition
# Crop the full predictor stack to the area of interest.
hbv_y <- raster::crop(ed, e.geo)
## Warning in showSRID(uprojargs, format = "PROJ", multiline = "NO", prefer_proj
## = prefer_proj): Discarded datum Unknown based on Airy 1830 ellipsoid in CRS
## definition
# Visual check: slope layer (position 43) over the cropped region.
plot(hbv_y[[43]], main = names(hbv_y[[43]]))

# Exclude climate variables with pairwise correlation > 0.7 using
# stepwise VIF selection (usdm::vifcor). The candidate set is the
# bioclim-style run TempSeasonality .. PrecipColdQuarter; select it by
# name rather than the previously hard-coded positions 26:41 so the
# code survives stack reordering.
clim_idx <- match("TempSeasonality", names(hbv_y)):match("PrecipColdQuarter", names(hbv_y))
whichVars <- usdm::vifcor(hbv_y[[clim_idx]], th = 0.7)
whichVars
## 11 variables from the 16 input variables have collinearity problem: 
##  
## AnnualPrecip PrecipWettestQuarter PrecipDriestQuarter PrecipWetMonth PrecipColdQuarter MaxTempWarmestMonth MeanTempColdQuarter PrecipDriestMonth TempAnnualRange MeanTempWarmQuarter PrecipSeasonality 
## 
## After excluding the collinear variables, the linear correlation coefficients ranges between: 
## min correlation ( MeanTempDriestQuarter ~ MinTempColdestMonth ):  0.1300289 
## max correlation ( PrecipWarmQuarter ~ TempSeasonality ):  -0.6616233 
## 
## ---------- VIFs of the remained variables -------- 
##               Variables      VIF
## 1       TempSeasonality 2.372025
## 2   MinTempColdestMonth 1.843731
## 3    MeanTempWetQuarter 1.840476
## 4 MeanTempDriestQuarter 1.270329
## 5     PrecipWarmQuarter 2.679469
# Drop the collinear layers flagged by vifcor from the stack.
hbv_y <- dropLayer(x=hbv_y, i = match(whichVars@excluded, names(hbv_y)))
# plot(hbv_y)
# Remaining layers after collinearity filtering: the land-cover classes,
# the five retained climate variables, elevation, slope and aspect.
names(hbv_y)
##  [1] "sea"                   "broad_wood"            "conif_wood"           
##  [4] "arable"                "impr_grass"            "neutr_grass"          
##  [7] "calc_grass"            "acid_grass"            "fen_marsh_swamp"      
## [10] "heather"               "heath_grass"           "bog"                  
## [13] "inland_rock"           "saltwater"             "freshwater"           
## [16] "sup_lit_rock"          "sup_lit_sed"           "lit_rock"             
## [19] "lit_sed"               "saltmarsh"             "urban"                
## [22] "suburban"              "AnnualTemp"            "MeanDiRange"          
## [25] "Isotherm"              "TempSeasonality"       "MinTempColdestMonth"  
## [28] "MeanTempWetQuarter"    "MeanTempDriestQuarter" "PrecipWarmQuarter"    
## [31] "elevation_UK"          "slope"                 "aspect"
# Working alias for the cropped, collinearity-filtered predictor stack;
# `ht` is the object the SDM fitting below uses as climate data.
ht <- hbv_y

# Visual check: first layer ("sea") over the region of interest.
plot(ht[[1]], main = names(ht[[1]]))

Species data

Plot number of data points in whole UK and for subset

# Read the day-flying moth records. readr warns about an unnamed column
# and a duplicated column name (X1 / X1_1) inherited from row indices
# written into the CSV; both are kept as-is.
dfm_df <- read_csv("C:/Users/thoval/Documents/Analyses/DayFlyingMoths.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Warning: Duplicated column names deduplicated: 'X1' => 'X1_1' [2]
## 
## -- Column specification --------------------------------------------------------
## cols(
##   .default = col_double(),
##   TO_STARTDATE = col_character(),
##   TO_ENDDATE = col_character(),
##   TO_GRIDREF = col_character(),
##   TO_LOCALITY = col_character(),
##   SQ_10 = col_character(),
##   CONCEPT = col_character(),
##   date = col_date(format = ""),
##   sp_n = col_character(),
##   com_n = col_character(),
##   ig = col_character(),
##   ag = col_character(),
##   id = col_character()
## )
## i Use `spec()` for the full column specifications.
head(dfm_df)
## # A tibble: 6 x 25
## ##      X1  X1_1 TO_ID TO_STARTDATE TO_ENDDATE DT_ID TO_GRIDREF TO_LOCALITY GR_ID
##   <dbl> <dbl> <dbl> <chr>        <chr>      <dbl> <chr>      <chr>       <dbl>
## 1     1   165  1410 13/07/2005   13/07/2005     1 NM430268   Isle of Mu~     1
## 2     2   166  1411 13/07/2005   13/07/2005     1 NM430268   Isle of Mu~     1
## 3     3  1466  6557 02/07/2005   02/07/2005     1 SK322596   Ash Brook,~     1
## 4     4  1937 10384 16/07/2002   16/07/2002     1 SV918113   St Mary's ~     1
## 5     5  2393 13520 21/07/2004   21/07/2004     1 SK440660   Holmewood ~     1
## 6     6  2394 13521 21/07/2004   21/07/2004     1 SK440660   Holmewood ~     1
## # ... with 16 more variables: VC_ID <dbl>, SQ_10 <chr>, CONCEPT <chr>,
## #   TO_PRECISION <dbl>, LATITUDE <dbl>, LONGITUDE <dbl>, lon <dbl>, lat <dbl>,
## #   date <date>, yr <dbl>, jd <dbl>, sp_n <chr>, com_n <chr>, ig <chr>,
## #   ag <chr>, id <chr>
# get eastings and northings
# c_en() is a project helper; presumably it adds EASTING/NORTHING columns
# derived from lon/lat (OSGB projection, hence the datum warnings) —
# TODO confirm against its definition.
dfm_df <- c_en(dfm_df)
## Warning in showSRID(uprojargs, format = "PROJ", multiline = "NO", prefer_proj =
## prefer_proj): Discarded datum OSGB 1936 in CRS definition
## Warning in showSRID(uprojargs, format = "PROJ", multiline = "NO", prefer_proj
## = prefer_proj): Discarded datum Unknown based on Airy 1830 ellipsoid in CRS
## definition
# Number of sightings per species across the whole UK, sorted low to high.
dfm_df %>% group_by(sp_n) %>% tally %>% 
  ggplot(aes(x = reorder(sp_n, n), y = n)) +
  geom_point() +
  theme(axis.text.x = element_text(angle = 50, hjust=1)) +
  xlab("species") + ggtitle("All UK") +
  ylab("Number of sightings")

# Restrict the sightings to the study region (53.1 < lat <= 54.4,
# -3.9 < lon < 0.2) and add the `species`/`year` aliases used by the
# modelling functions below.
sp_y <- dfm_df %>%
  filter(lat > 53.1, lat <= 54.4,
         lon > -3.9, lon < 0.2) %>%
  mutate(species = sp_n,
         year = yr)

# Number of sightings per species within the subset region.
sp_y %>%
  count(sp_n) %>%
  ggplot(aes(x = reorder(sp_n, n), y = n)) +
  geom_point() +
  theme(axis.text.x = element_text(angle = 50, hjust = 1)) +
  xlab("species") +
  ggtitle("Subset UK, 53.1 < latitude <= 54.4") +
  ylab("Number of sightings")

Pseudoabsences

#### Create presence/absence
spp <- unique(sp_y$species)

# as a hack to not have to create a new function for east and north
# create a data frame that has East and north renamed as lon lat
# (cpa() expects lon/lat columns, but we work on the projected grid)
ndf <- sp_y %>% mutate(lon = EASTING,
                       lat = NORTHING) %>% 
  dplyr::select(-EASTING, -NORTHING)


# Build presence/pseudo-absence data for every species in parallel.
# cpa() is a project helper — presumably it assembles presences plus
# matched pseudo-absences per species; TODO confirm its contract.
registerDoParallel(cores = detectCores() - 1)

system.time( 
  # seq_along() instead of 1:length() so an empty spp vector would not
  # iterate over c(1, 0)
  ab1 <- foreach(i = seq_along(spp)) %dopar% {
    
    cpa(spdat = ndf, species = spp[i], matchPres = TRUE,
        minYear = 2000, maxYear = 2017, recThresh = 5)
    
  }
)
##    user  system elapsed 
##    0.13    0.19    0.99
registerDoSEQ()

# Name the list entries by species so results can be looked up by name.
names(ab1) <- spp

Logistic regression

Parallelised lr sdm with 10 k-fold validations to get bootstrapped errors. ~10 minute run time

Set the K value to the number of bootstraps that you want for the model. This randomly samples the data and fits a model k times. The models are stored in the output object and so then can be used to predict the probability of presence.

# do all species in parallel to save time
registerDoParallel(cores = detectCores() - 1)

names(ab1) <- spp
system.time(
  spp_lr_out <- foreach(s = seq_along(spp), #.combine='comb', .multicombine=TRUE,
                        # .init=list(list()), 
                        .packages = c('tidyverse', 'raster', 'readr', 'dismo'),
                        .errorhandling = 'pass') %dopar% {
                          
                          # print(paste(s, spp[s], sep = " "))
                          
                          # BUG FIX: the original guard tested is.null(ab1[s]);
                          # single-bracket indexing returns a length-1 list,
                          # which is never NULL, so the guard could never fire.
                          # Also `next` is invalid inside a foreach body — yield
                          # NULL for species without data instead.
                          if(is.null(ab1[[s]])){
                            # print(paste("No data for species", spp[s]))
                            NULL
                          } else {
                            # Fit a logistic-regression SDM with 10 bootstrap
                            # folds (fsdm is a project helper), then predict the
                            # per-cell standard error of the fitted model.
                            sdm_lr <- fsdm(species = spp[s], model = "lr", 
                                           climDat = ht, spData = ab1, knots = -1,
                                           k = 10,
                                           prediction = T,
                                           inters = F, 
                                           write =  F, outPath = "C:/Users/thoval/Documents/Analyses/lr_outs/")
                            
                            se_out <- predict(ht, sdm_lr$Model, type = "response", se.fit = TRUE, index = 2)
                            # save(se_out, file = paste0("C:/Users/thoval/Documents/Analyses/lr_outs/", spp[s], "_lr_se_error_prediction.rdata"))
                            
                            # element [[1]] = fsdm output, [[2]] = SE raster
                            list(sdm_lr, se_out)
                          }
                        }
)
##    user  system elapsed 
##   19.29   28.05  604.58
registerDoSEQ()

The output of this function is a bit odd as it has a list within another list. The first list is each separate species, and the second list is the output from the fsdm function [[1]] and the standard error [[2]]. Obviously, with models that do not have a standard error prediction there will be no entry in the second list.

predict from each bootstrapped model

At the moment, just a regular for loop that predicts the probability of presence for the whole region of interest which makes it slow. The predict function has an extent argument meaning that we can predict for smaller subsets of the overall distribution of a species.

First let’s choose the extent

# Sub-extent (British National Grid metres) used for faster test
# predictions: x 340000-400000, y 450000-490000.
ext <- extent(matrix(c(340000,450000, 400000,490000), ncol = 2))

# spp_lr_out[[2]] is the second species in ab1 (Zygaena filipendulae,
# per the comment further down in the original script); [[1]] is the
# fsdm output, which carries $Predictions, $Species and $Data.
plot(spp_lr_out[[2]][[1]]$Predictions, main = spp_lr_out[[2]][[1]]$Species)
# Overlay the presence records (val == 1) in red.
points(x = spp_lr_out[[2]][[1]]$Data$lon[spp_lr_out[[2]][[1]]$Data$val == 1],
       spp_lr_out[[2]][[1]]$Data$lat[spp_lr_out[[2]][[1]]$Data$val == 1],
       cex = 0.6, col = "red", pch = 20)
# Draw the test extent on top of the full prediction.
plot(ext, add = T)

# Crop the full prediction to the test extent for quicker plotting.
test_area <- crop(spp_lr_out[[2]][[1]]$Predictions, ext) 
plot(test_area, main = paste(spp_lr_out[[2]][[1]]$Species, 'cropped'))

Slow! ~

There is probably a way to speed this up but I need to figure out how to parallelise nested for loops, which is tricky because of the need to catch and skip errors.

I think the way to do this is to change the second for loop to be s in 1:20 (i.e. the number of bootstrapped models rather than the length of the object itself). This is because errors arise when some of the spp_lr_out[[]] entries are NULL, so we can’t take the length of such an object. Then use the %:% and %dopar% operators to send the work to multiple cores.

# For every fitted species, predict probability of presence over `ext`
# from each of its bootstrapped models and stack the results per species.
spp_out_boot <- list()

# beginCluster(n = 7)

system.time(
  for(s in seq_along(spp_lr_out)){
    
    # some entries in the list are NULL because of too few data points in the kfold cross validation part of the model above
    skip_to_next <- FALSE
    
    # find the error: extracting $Bootstrapped_models from a NULL/error
    # entry throws, which flips the skip flag via <<-
    tryCatch({ 
      bt_mods_loc <- spp_lr_out[[s]][[1]]$Bootstrapped_models
    },
    error = function(e) {skip_to_next <<- TRUE})
    
    # skip to next if error
    if(skip_to_next){print(paste("SKIPPING LIST ENTRY", s))
      next}
    
    print(paste(s, spp_lr_out[[s]][[1]]$Species))
    
    # Preallocate one slot per bootstrapped model instead of growing the
    # list inside the loop, and reuse bt_mods_loc extracted above.
    pred_out_boot <- vector("list", length(bt_mods_loc))
    for(b in seq_along(bt_mods_loc)) {
      # print(b)
      
      # predict from each bootstrapped model
      b_t <- bt_mods_loc[[b]]
      
      # non-parallel
      p_t <- suppressWarnings(predict(ht, b_t, index = NULL, type = 'response', ext = ext))
      
      # parallel
      # p_t <- clusterR(ht, predict, args = list(b_t, type = 'response', index = NULL, ext = ext), export='ext')
      
      # store
      pred_out_boot[[b]] <- p_t
    }
    
    # stack all the predictions and store as outputs, keyed by species name
    boots <- do.call("stack", pred_out_boot)
    spp_out_boot[[spp_lr_out[[s]][[1]]$Species]] <- boots
    
  }
)
## [1] "1 Tyria jacobaeae"
## [1] "2 Zygaena filipendulae"
## [1] "3 Zygaena lonicerae"
## [1] "4 Chiasmia clathrata"
## [1] "5 Odezia atrata"
## [1] "6 Ematurga atomaria"
## [1] "7 Perizoma albulata"
## [1] "8 Archiearis parthenias"
## [1] "9 Euclidia mi"
## [1] "10 Pseudopanthera macularia"
## [1] "11 Anarta myrtilli"
## [1] "12 Parasemia plantaginis"
## [1] "13 Panemeria tenebrata"
## [1] "14 Adscita statices"
## [1] "15 Scotopteryx bipunctaria"
## [1] "16 Lycia zonaria"
## [1] "17 Orgyia antiqua"
## [1] "18 Adscita geryon"
## [1] "19 Euclidia glyphica"
## [1] "20 Perconia strigillaria"
## [1] "21 Orgyia recens"
## [1] "22 Photedes captiuncula"
## [1] "23 Phytometra viridaria"
## [1] "24 Idaea muricata"
## [1] "25 Rheumaptera hastata"
## [1] "26 Epirrhoe tristata"
## [1] "27 Hemaris fuciformis"
## [1] "SKIPPING LIST ENTRY 28"
## [1] "SKIPPING LIST ENTRY 29"
## [1] "SKIPPING LIST ENTRY 30"
##    user  system elapsed 
##  233.11   60.31  297.08
# endCluster()

This sometimes throws an error because of memory allocation (cannot allocate vector size xxxMB) when using larger extents.

error for Zygaena lonicerae

The quantile/range function takes a long time when projecting to a large area, with the extent that we decided on above it’s not too bad (~<1min).

This block of code calculates the 0.05 and 0.95 quantile in each cell of each of the bootstrapped predictions, subtracts the lower from the upper and then plots the result.

# registerDoParallel(cores = detectCores() - 1)
# Per-cell 5% and 95% quantiles across the bootstrapped predictions for
# Zygaena lonicerae; the spread between them is a per-cell measure of
# prediction uncertainty.
zl_q <- calc(spp_out_boot$`Zygaena lonicerae` , fun = function(x) {quantile(x, probs = c(0.05, 0.95), na.rm = T)})

# system.time(
#   zl_q <- clusterR(spp_out_boot$`Zygaena lonicerae`, calc, 
#                    args = list(fun = function(x) {quantile(x, probs = c(0.05, 0.95), na.rm = T)}))
# )
# 
# registerDoSEQ()

zl_q
## class      : RasterBrick 
## dimensions : 400, 600, 240000, 2  (nrow, ncol, ncell, nlayers)
## resolution : 100, 100  (x, y)
## extent     : 340000, 4e+05, 450000, 490000  (xmin, xmax, ymin, ymax)
## crs        : +proj=tmerc +lat_0=49 +lon_0=-2 +k=0.9996012717 +x_0=400000 +y_0=-100000 +ellps=airy +units=m +no_defs 
## source     : memory
## names      :          X5.,         X95. 
## min values : 1.999644e-08, 1.061109e-04 
## max values :     0.959281,     1.000000
# 95% minus 5% quantile per cell (renamed from the misleading
# `zl_q_25_75` — the quantiles computed above are 0.05 and 0.95).
zl_q_5_95 <- zl_q[[2]]-zl_q[[1]]

# names(ab1) # zygaena filip is second in the list

# NOTE(review): test_area holds the Zygaena filipendulae prediction
# (spp_lr_out[[2]]) while zl_q_5_95 is the Zygaena lonicerae spread —
# confirm this cross-species comparison is intentional.
par(mfrow = c(1,2))
plot(test_area, main = paste(spp_lr_out[[2]][[1]]$Species, "distribution"))
# points(x = spp_lr_out[[2]][[1]]$Data$lon[spp_lr_out[[2]][[1]]$Data$val == 1], 
#        spp_lr_out[[2]][[1]]$Data$lat[spp_lr_out[[2]][[1]]$Data$val == 1],
#        cex = 0.6, col = "red", pch = 20)
plot(zl_q_5_95, main = "95% - 5% quantile")
points(x = spp_lr_out[[2]][[1]]$Data$lon[spp_lr_out[[2]][[1]]$Data$val == 1],
       spp_lr_out[[2]][[1]]$Data$lat[spp_lr_out[[2]][[1]]$Data$val == 1],
       cex = 0.6, col = "red", pch = 20)

par(mfrow = c(1,1))

code for error across all species

Now let’s do the same for all of the species in the bootstrapped outputs. Parallelise this!!! It would be easy…

length(spp_out_boot)
## [1] 27
# Earlier serial version kept for reference; superseded by the parallel
# foreach below.
# boot_out <- vector(mode = "list", length = length(spp_out_boot))
# 
# # beginCluster(7)
# 
# for(bs in 1:length(spp_out_boot)){
#   print(bs)
#   
#   sp_bts <- spp_out_boot[[bs]]
#   
#   
#   # zl_q <- calc(spp_out_boot$`Zygaena lonicerae` , fun = function(x) {quantile(x, probs = c(0.05, 0.95), na.rm = T)})
#   # sp_q <- clusterR(sp_bts, calc, 
#   #                  args = list(fun = function(x) {quantile(x, probs = c(0.05, 0.95), na.rm = T)}))
#   
#   sp_q <- calc(sp_bts, fun = function(x) {quantile(x, probs = c(0.05, 0.95),
#                                                    na.rm = T)})
#   
#   sp_q_25_75 <- sp_q[[2]]-sp_q[[1]]
#   
#   names(sp_q_25_75) <- names(spp_out_boot)[bs]
#   
#   boot_out[[bs]] <- sp_q_25_75
#   
# }
# 
# # endCluster()


# Compute the per-cell 95%-5% quantile spread of the bootstrapped
# predictions for every species in parallel; each result is a
# RasterLayer named after its species.
registerDoParallel(cores = detectCores() - 1)

system.time(
  boot_out <- foreach(bs = seq_along(spp_out_boot), #.combine='comb', .multicombine=TRUE,
                      # .init=list(list()), 
                      .packages = c('raster'),
                      .errorhandling = 'pass') %dopar% {
                        
                        sp_bts <- spp_out_boot[[bs]]
                        
                        # per-cell 5% and 95% quantiles across bootstraps
                        sp_q <- calc(sp_bts, fun = function(x) {quantile(x, probs = c(0.05, 0.95),
                                                                         na.rm = T)})
                        
                        # spread between the upper and lower quantile
                        sp_q_5_95 <- sp_q[[2]]-sp_q[[1]]
                        
                        names(sp_q_5_95) <- names(spp_out_boot)[bs]
                        
                        sp_q_5_95
                        
                      })
##    user  system elapsed 
##    3.62    3.14  309.44
registerDoSEQ()

Plot all of the species together. It’s important to remember that these plots are still from logistic regression models, so don’t judge them too harshly. In the final models, the prediction and variation layers could come from better models, or even from a weighted average/summed total of the predictions of multiple different models.

# 
# 
# length(boot_out)
#
# Plot prediction, standard error and bootstrap quantile spread for each
# species, three panels per row.
# NOTE(review): spp_lr_out entry 20 is skipped by offsetting the index
# past j = 19 (presumably that fit or its SE prediction failed — confirm),
# and boot_out has 27 entries but the loop only reaches 26. boot_out[[j]]
# is paired with spp_lr_out[[j+1]] for j >= 20, which may misalign
# species between panels — verify against the species names in the titles.
par(mfrow = c(2,3))
for(j in 1:26){
  if(j < 20){
    plot(spp_lr_out[[j]][[1]]$Predictions, main = spp_lr_out[[j]][[1]]$Species, 
         xlim = c(340000, 400000), 
         ylim = c(450000, 490000))
    plot(spp_lr_out[[j]][[2]], main = paste(spp_lr_out[[j]][[1]]$Species, 'Standard error'), 
         xlim = c(340000, 400000), 
         ylim = c(450000, 490000))
    plot(boot_out[[j]], main = paste(names(boot_out[[j]]), '95% - 5% quantile'))
  } else {
    # j >= 20: shift the spp_lr_out index by one to hop over entry 20
    plot(spp_lr_out[[j+1]][[1]]$Predictions, main = spp_lr_out[[j+1]][[1]]$Species, 
         xlim = c(340000, 400000), 
         ylim = c(450000, 490000))
    plot(spp_lr_out[[j+1]][[2]], main = paste(spp_lr_out[[j+1]][[1]]$Species, 'Standard error'), 
         xlim = c(340000, 400000), 
         ylim = c(450000, 490000))
    plot(boot_out[[j]], main = paste(names(boot_out[[j]]), '95% - 5% quantile'))
  }
}

par(mfrow = c(1,1))